/*==============================================================================
FILE NAME: Process_Zipcode_Census_Demographics.do
INPUTS: raw Zipcode Decennial Census Demographics data: R12907124.dct, 
R12907124_SL860.txt
OUTPUTS: zipcode_demographics_new.dta
CREATED: 27 September 2023
UPDATED: 2 November 2023
==============================================================================*/

clear all
set more off

/* Set directory if working independently through code
if c(username)=="" { //insert username
    global rootdir "" // insert root path
    global raw_data "$rootdir/raw_data"  // raw inputs read by the infile below; adjust if your folder layout differs
    global processed_data "$rootdir/processed_data"  // Define global paths for replication package
} 
*/

// Both path globals are required: raw_data is read by the import below and
// processed_data is used when the output is saved. Stop with a clear message
// if either is undefined, rather than silently resolving to a root-relative
// path such as "/zipcode_demographics_new.dta".
if `"$raw_data"' == "" | `"$processed_data"' == "" {
    display as error "Globals raw_data and processed_data must be defined before running this do-file."
    exit 198
}

// Import raw demographic data: the .dct dictionary describes the layout of
// the accompanying fixed-format text file named in using().
infile using "$raw_data/Zipcode Decennial Census Demographics/R12907124.dct", ///
    using("$raw_data/Zipcode Decennial Census Demographics/R12907124_SL860.txt")

// Convert the ZCTA identifier to numeric (non-numeric values become missing
// because of -force-) and give it the ZipCode name.
destring ZCTA5, replace force
rename ZCTA5 ZipCode

// Retain only the ZIP code ranges treated as Texas by this script:
//   73301, 75001-79999, and 88510-88589.
// inrange() is false for missing values, so observations whose ZipCode could
// not be converted are dropped as well.
keep if ZipCode == 73301                  ///
      | inrange(ZipCode, 75001, 79999)    ///
      | inrange(ZipCode, 88510, 88589)

// ---------------------------------------------------------------------------
// Build the analysis variables from the raw table fields.
// All shares are simple ratios; Stata returns missing when the denominator is
// zero or missing, so no explicit guard is needed.
// Row sums use egen rowtotal() (the documented replacement for the legacy
// rsum() alias) with the -missing- option, so a row whose components are ALL
// missing yields a missing total instead of a spurious 0.
// ---------------------------------------------------------------------------

// Rename and create population variables
rename T001_001 tot_pop
rename T002_002 urban_pop
rename T002_005 rural_pop
rename T003_001 pop_density
gen urban_share = urban_pop/tot_pop

// Create race/ethnicity share variables (all relative to T015_001)
gen hisp_share = T015_010/T015_001
gen white_share = T015_003/T015_001
gen black_share = T015_004/T015_001
// "other" pools fields T015_005 through T015_009
egen other = rowtotal(T015_005 T015_006 T015_007 T015_008 T015_009), missing
gen other_share = other/T015_001

// Household and education variables (education shares relative to T040_001)
rename T026_001 avg_hh_size
gen less_HS_share = T040_002/T040_001
// college_pop pools fields T040_004 through T040_008
egen college_pop = rowtotal(T040_004 T040_005 T040_006 T040_007 T040_008), missing
gen college_share = college_pop/T040_001

// Labor force and unemployment variables
gen LFPR = T069_002/T069_001
// NOTE(review): the denominator here is T069_002, the same field used as the
// LFPR numerator. Confirm against the data dictionary that this is the
// intended labor-force base for the unemployment rate.
gen unemployment_rate = T069_006/T069_002

// Income and housing variables
rename T093_001 median_hh_inc
rename T160_001 median_year_built
rename T163_001 median_hh_value
rename T167_001 median_rent
rename T168_001 median_rent_share

// Poverty variables
gen child_poverty_rate = T180_002/T180_001
// Adult figures combine the T181 and T182 tables: *_001 is the population
// base and *_002 the count in poverty.
egen adult_pop = rowtotal(T181_001 T182_001), missing
egen adult_pov = rowtotal(T181_002 T182_002), missing
gen adult_pov_rate = adult_pov/adult_pop

// Commute time variable
rename T218_001 avg_commute_time

// Restrict the dataset to the identifier and the constructed/renamed
// analysis variables; every other raw field and intermediate total is dropped.
keep ZipCode                                                          ///
     tot_pop urban_pop rural_pop urban_share pop_density              ///
     hisp_share white_share black_share other_share                   ///
     avg_hh_size less_HS_share college_share                          ///
     LFPR unemployment_rate                                           ///
     median_hh_inc median_year_built median_hh_value                  ///
     median_rent median_rent_share                                    ///
     child_poverty_rate adult_pov_rate                                ///
     avg_commute_time

// Write the processed demographics file, overwriting any existing copy.
save "$processed_data/zipcode_demographics_new.dta", replace